#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
智能例句筛选器 - 基于优质例句规则的筛选算法
作者: 雨婷
版本: 1.0
日期: 2025年6月19日

功能：
1. 从word_finder的输出中筛选出最优质的例句
2. 基于8条优质例句规则进行质量评分
3. 去重和智能筛选，为每个单词选择最佳例句
"""

import os
import re
import csv
import json
import math
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Set
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords

# 下载必要的NLTK数据
# Ensure the NLTK resources used below are present, downloading on first run.
_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
    ('corpora/stopwords', 'stopwords'),
)
for _resource_path, _package_name in _NLTK_RESOURCES:
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)


class SentenceQualityEvaluator:
    """Scores candidate example sentences against eight quality rules.

    Each ``evaluate_*`` method implements one rule with its own point cap;
    ``calculate_total_score`` combines the per-rule scores into a weighted
    total.  The rules favor short, complete, clear, child-friendly sentences.
    """
    
    def __init__(self):
        # NLTK English stop-word list; used to tell content words apart
        # from function words in several rules.
        self.stop_words = set(stopwords.words('english'))
        # Basic vocabulary (common words children are assumed to already know);
        # tokens outside this set count as "difficult" in rule 4.
        self.basic_vocabulary = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
            'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
            'my', 'your', 'his', 'her', 'its', 'our', 'their', 'this', 'that', 'these', 'those',
            'is', 'am', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
            'do', 'does', 'did', 'will', 'would', 'can', 'could', 'should', 'may', 'might',
            'go', 'come', 'get', 'put', 'take', 'give', 'make', 'see', 'look', 'like', 'want',
            'good', 'bad', 'big', 'small', 'new', 'old', 'happy', 'sad', 'nice', 'little'
        }
        
        # Phrasal-verb dictionary, used in rule 3 to tell a true preposition
        # apart from a phrasal-verb particle (e.g. "up" in "get up").
        self.phrasal_verbs = {
            'look after', 'look at', 'look for', 'look up', 'look out',
            'get up', 'get on', 'get off', 'get in', 'get out',
            'put on', 'put off', 'put up', 'put down',
            'turn on', 'turn off', 'turn up', 'turn down'
        }
        
        # Dialogue indicator words (tokens that typically occur in speech),
        # consulted by rule 7.
        self.dialogue_indicators = {
            'i', 'you', 'me', 'my', 'your', 'let', 'please', 'hello', 'hi', 'bye', 'goodbye',
            'yes', 'no', 'ok', 'okay', 'well', 'oh', 'ah', 'wow', 'hey'
        }

    def evaluate_completeness(self, sentence: str) -> float:
        """Rule 1 - content completeness, max 20 points.

        Awards 10 points for having both a subject and a verb (5 for one of
        the two), 5 for being at least 4 tokens long, and 5 for ending with
        terminal punctuation.
        """
        score = 0
        
        # POS-tag the sentence to check for a subject and a predicate.
        tokens = word_tokenize(sentence.lower())
        pos_tags = pos_tag(tokens)
        
        # Pronoun (PRP*) or noun (NN*) counts as a subject; VB* as a verb.
        has_subject = any(tag.startswith('PRP') or tag.startswith('NN') for _, tag in pos_tags)
        has_verb = any(tag.startswith('VB') for _, tag in pos_tags)
        
        if has_subject and has_verb:
            score += 10
        elif has_subject or has_verb:
            score += 5
            
        # Very short token counts suggest a fragment rather than a sentence.
        if len(tokens) >= 4:
            score += 5
            
        # Reward a proper sentence-final period, exclamation or question mark.
        if sentence.strip().endswith(('.', '!', '?')):
            score += 5
            
        return min(score, 20)

    def evaluate_length(self, sentence: str) -> float:
        """Rule 2 - sentence length, max 15 points.

        4-15 tokens is the ideal range (full marks); shorter or longer
        sentences lose points proportionally.
        """
        tokens = word_tokenize(sentence)
        length = len(tokens)
        
        if 4 <= length <= 15:
            # Ideal length range: full marks.
            return 15
        elif length < 4:
            # Too short: scale linearly so that 4 tokens == 15 points.
            return max(0, length * 3.75)  # 4 words = 15 points
        else:
            # Too long: deduct 1.5 points per token beyond 15, floor at 0.
            excess = length - 15
            penalty = min(excess * 1.5, 15)  # 1.5-point penalty per extra word
            return max(0, 15 - penalty)

    def evaluate_word_clarity(self, sentence: str, target_word: str) -> float:
        """Rule 3 - clarity of the target word's meaning, max 20 points.

        Rewards the target word appearing as a standalone token and (if it is
        a preposition) NOT being swallowed inside a known phrasal verb, where
        its literal meaning would be obscured.
        """
        score = 0
        sentence_lower = sentence.lower()
        target_lower = target_word.lower()
        
        # The target word must appear as an independent token, not a substring.
        words = word_tokenize(sentence_lower)
        if target_lower in words:
            score += 10
        
        # Scan for a known phrasal verb that contains the target word.
        # NOTE: the `else` below belongs to the `for` and runs only when no
        # such phrasal verb was found (i.e. the loop finished without break).
        for phrasal_verb in self.phrasal_verbs:
            if phrasal_verb in sentence_lower and target_lower in phrasal_verb.split():
                # A preposition used as a phrasal-verb particle does not
                # demonstrate its prepositional meaning: deduct points.
                if target_lower in ['after', 'up', 'on', 'off', 'out', 'in', 'at', 'for']:
                    score -= 5
                break
        else:
            # Target word is not part of a phrasal verb; bonus if it is a
            # preposition used in its literal role.
            if target_lower in ['after', 'up', 'on', 'off', 'out', 'in', 'at', 'for']:
                score += 5
                
        # Content words (non stop-words) carry more meaning in the sentence.
        if target_lower not in self.stop_words:
            score += 5
            
        return min(score, 20)

    def evaluate_difficulty(self, sentence: str, target_word: str) -> float:
        """Rule 4 - difficulty appropriateness, max 15 points.

        Up to 10 points based on how few non-basic words the sentence uses
        (excluding the target word itself), plus up to 5 points for simple
        syntax (few subordinate-clause markers).
        """
        tokens = word_tokenize(sentence.lower())
        
        # Count non-basic vocabulary, excluding the target word and very
        # short tokens (<= 2 letters), which are unlikely to be hard.
        difficult_words = 0
        for token in tokens:
            if (token.isalpha() and 
                token not in self.basic_vocabulary and 
                token != target_word.lower() and
                len(token) > 2):
                difficult_words += 1
        
        # Fewer difficult words -> higher vocabulary score.
        if difficult_words <= 2:
            difficulty_score = 10
        elif difficult_words <= 4:
            difficulty_score = 7
        elif difficult_words <= 6:
            difficulty_score = 4
        else:
            difficulty_score = 0
            
        # Estimate syntactic complexity from subordinating markers.
        pos_tags = pos_tag(tokens)
        complex_structures = 0
        
        # Each relative/subordinate marker found costs one syntax point.
        for token, tag in pos_tags:
            if token in ['that', 'which', 'who', 'when', 'where', 'because', 'although']:
                complex_structures += 1
                
        syntax_score = max(0, 5 - complex_structures)
        
        return difficulty_score + syntax_score

    def evaluate_positivity(self, sentence: str) -> float:
        """Rule 5 - positive tone and child relevance, max 10 points.

        Starts from a 5-point base, adds for positive or child-related
        vocabulary, subtracts for negative vocabulary; result is clamped
        to [0, 10].  Matching is substring-based on the lowercased sentence.
        """
        sentence_lower = sentence.lower()
        
        # Positive vocabulary.
        positive_words = {
            'love', 'like', 'happy', 'good', 'great', 'wonderful', 'nice', 'fun', 'play',
            'laugh', 'smile', 'enjoy', 'beautiful', 'lovely', 'best', 'favorite'
        }
        
        # Negative vocabulary.
        negative_words = {
            'hate', 'angry', 'bad', 'terrible', 'awful', 'sad', 'cry', 'hurt', 'pain',
            'scary', 'afraid', 'worried', 'sick', 'broken'
        }
        
        # Child-related vocabulary.
        child_related = {
            'play', 'game', 'toy', 'family', 'mummy', 'daddy', 'brother', 'sister',
            'school', 'friend', 'home', 'garden', 'park', 'birthday', 'cake'
        }
        
        score = 5  # base score
        
        # At most one +2 bonus for any positive word present.
        for word in positive_words:
            if word in sentence_lower:
                score += 2
                break
                
        # At most one -3 penalty for any negative word present.
        for word in negative_words:
            if word in sentence_lower:
                score -= 3
                break
                
        # At most one +3 bonus for child-relevant vocabulary.
        for word in child_related:
            if word in sentence_lower:
                score += 3
                break
                
        return min(max(score, 0), 10)

    def evaluate_diversity(self, sentence: str, existing_patterns: Set[str]) -> float:
        """Rule 6 - sentence-pattern diversity, max 10 points.

        Derives a coarse POS pattern from the first five tokens and compares
        it with patterns already seen for this word.  NOTE: this method
        mutates *existing_patterns* by adding any new pattern it finds.
        """
        # Simplified sentence-pattern recognition via POS tagging.
        tokens = word_tokenize(sentence.lower())
        pos_tags = pos_tag(tokens)
        
        # Build a coarse pattern from at most the first 5 tokens.
        pattern = []
        for token, tag in pos_tags[:5]:  # only the leading pattern matters
            if tag.startswith('VB'):
                pattern.append('VERB')
            elif tag.startswith('NN'):
                pattern.append('NOUN')
            elif tag.startswith('PRP'):
                pattern.append('PRON')
            elif tag.startswith('JJ'):
                pattern.append('ADJ')
            elif token in ['the', 'a', 'an']:
                pattern.append('DET')
                
        pattern_str = '-'.join(pattern)
        
        # Unseen pattern: full marks (and remember it); repeated pattern: 3.
        if pattern_str not in existing_patterns:
            existing_patterns.add(pattern_str)
            return 10
        else:
            return 3  # repeated pattern scores low

    def evaluate_dialogue_priority(self, sentence: str) -> float:
        """Rule 7 - dialogue-priority principle, max 5 points.

        Dialogue-style sentences (speech indicators, quotation marks,
        questions/exclamations) score higher.
        """
        sentence_lower = sentence.lower()
        
        # One point per dialogue-indicator token present in the sentence.
        dialogue_score = 0
        for indicator in self.dialogue_indicators:
            if indicator in word_tokenize(sentence_lower):
                dialogue_score += 1
                
        # Quotation marks suggest reported speech.
        # NOTE(review): the "'" test also matches apostrophes in contractions
        # like "don't", which are not dialogue — confirm this is acceptable.
        if '"' in sentence or "'" in sentence:
            dialogue_score += 2
            
        # Questions and exclamations are typical of dialogue.
        if sentence.strip().endswith(('?', '!')):
            dialogue_score += 1
            
        return min(dialogue_score, 5)

    def evaluate_contextual_richness(self, sentence: str) -> float:
        """Rule 8 - contextual richness, max 5 points.

        Scores information density: 0.5 points per content (non-stop) word
        plus 1 point per descriptive element (adjective/adverb), capped at 5.
        """
        tokens = word_tokenize(sentence.lower())
        
        # Content words = alphabetic tokens that are not stop words.
        content_words = [token for token in tokens 
                        if token.isalpha() and token not in self.stop_words]
        
        # Count descriptive elements via POS tags.
        pos_tags = pos_tag(tokens)
        descriptive_elements = 0
        
        for token, tag in pos_tags:
            if tag.startswith('JJ'):  # adjective
                descriptive_elements += 1
            elif tag.startswith('RB'):  # adverb
                descriptive_elements += 1
                
        # Combine content-word count and descriptive elements, capped at 5.
        richness_score = min(len(content_words) * 0.5 + descriptive_elements, 5)
        
        return richness_score

    def calculate_total_score(self, sentence: str, target_word: str, 
                            existing_patterns: Set[str]) -> Dict[str, float]:
        """Compute all eight rule scores plus a weighted total.

        Returns a dict with one entry per rule and a ``'total'`` key.
        NOTE: mutates *existing_patterns* via ``evaluate_diversity``.
        """
        scores = {
            'completeness': self.evaluate_completeness(sentence),
            'length': self.evaluate_length(sentence),
            'word_clarity': self.evaluate_word_clarity(sentence, target_word),
            'difficulty': self.evaluate_difficulty(sentence, target_word),
            'positivity': self.evaluate_positivity(sentence),
            'diversity': self.evaluate_diversity(sentence, existing_patterns),
            'dialogue': self.evaluate_dialogue_priority(sentence),
            'richness': self.evaluate_contextual_richness(sentence)
        }
        
        # Per-rule weights; rules 1 and 3 dominate the total.
        weights = {
            'completeness': 0.20,
            'length': 0.15,
            'word_clarity': 0.20,
            'difficulty': 0.15,
            'positivity': 0.10,
            'diversity': 0.10,
            'dialogue': 0.05,
            'richness': 0.05
        }
        
        total_score = sum(scores[key] * weights[key] for key in scores)
        scores['total'] = total_score
        
        return scores


class SentenceSelector:
    """Smart example-sentence selector.

    Pipeline: load word/sentence match records from the word_finder CSV,
    deduplicate identical sentences per word, score every candidate with
    :class:`SentenceQualityEvaluator`, pick the best non-redundant sentences
    per word, then write CSV results and a text report to *output_dir*.
    """
    
    def __init__(self, input_csv_path: str, output_dir: str = "data/selected_sentences"):
        self.input_csv_path = input_csv_path
        self.output_dir = output_dir
        self.evaluator = SentenceQualityEvaluator()
        
        # Make sure the output directory exists before anything is written.
        os.makedirs(self.output_dir, exist_ok=True)
        
        # raw_data: all CSV rows as read; word_sentences: rows grouped by
        # lower-cased target word; selected_sentences: final picks per word.
        self.raw_data = []
        self.word_sentences = defaultdict(list)
        self.selected_sentences = {}
        
    def load_data(self):
        """Load the match CSV and group rows by lower-cased target word.

        Expects at least the columns ``target_word``, ``sentence``,
        ``source_file`` and ``timestamp`` (assumption — confirm against the
        word_finder output schema).
        """
        print("📖 加载匹配数据...")
        
        with open(self.input_csv_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.raw_data.append(row)
                word = row['target_word'].lower()
                self.word_sentences[word].append(row)
        
        print(f"✓ 加载完成：{len(self.raw_data)} 条匹配记录，{len(self.word_sentences)} 个不同单词")

    def deduplicate_sentences(self):
        """Merge rows that share the same sentence text for a word.

        Duplicate rows are collapsed into one record whose ``source_file``
        and ``timestamp`` fields become sorted, comma-joined unions and
        which gains a ``frequency`` field with the occurrence count.
        """
        print("🔄 进行句子去重处理...")

        for word, sentences in self.word_sentences.items():
            # Group records by their (stripped) sentence text.
            sentence_groups = defaultdict(list)
            for sentence_data in sentences:
                clean_sentence = sentence_data['sentence'].strip()
                sentence_groups[clean_sentence].append(sentence_data)

            # Collapse each group of identical sentences into one record.
            deduplicated = []
            for sentence_text, group in sentence_groups.items():
                if len(group) == 1:
                    deduplicated.append(group[0])
                else:
                    # Merge provenance from all duplicates into the first row.
                    merged = group[0].copy()
                    source_files = set()
                    timestamps = set()

                    for item in group:
                        source_files.add(item['source_file'])
                        timestamps.add(item['timestamp'])

                    merged['source_file'] = ', '.join(sorted(source_files))
                    merged['timestamp'] = ', '.join(sorted(timestamps))
                    merged['frequency'] = len(group)  # occurrence count

                    deduplicated.append(merged)

            self.word_sentences[word] = deduplicated

        total_after_dedup = sum(len(sentences) for sentences in self.word_sentences.values())
        print(f"✓ 去重完成：{len(self.raw_data)} → {total_after_dedup} 条记录")

    def select_best_sentences(self, max_sentences_per_word: int = 3):
        """Pick up to *max_sentences_per_word* best-scoring sentences per word.

        A sentence already claimed by another word is skipped, and candidates
        too similar to an already-selected sentence for the same word are
        rejected (see :meth:`_is_too_similar`).
        """
        print(f"🎯 开始智能筛选，每个单词最多选择 {max_sentences_per_word} 个例句...")

        used_sentences = set()  # sentences already claimed by some word
        word_patterns = defaultdict(set)  # POS patterns used per word (rule 6)

        for word, sentences in self.word_sentences.items():
            print(f"   处理单词: {word} ({len(sentences)} 个候选句子)")

            # Score every still-available candidate for this word.
            scored_sentences = []
            for sentence_data in sentences:
                sentence_text = sentence_data['sentence']

                # Skip sentences already assigned to another word.
                if sentence_text in used_sentences:
                    continue

                scores = self.evaluator.calculate_total_score(
                    sentence_text, word, word_patterns[word]
                )

                sentence_data['scores'] = scores
                sentence_data['total_score'] = scores['total']
                scored_sentences.append(sentence_data)

            # Highest total score first.
            scored_sentences.sort(key=lambda x: x['total_score'], reverse=True)

            # Greedily keep the best candidates that are not near-duplicates.
            selected = []
            for sentence_data in scored_sentences[:max_sentences_per_word * 2]:  # extra candidates for similarity filtering
                sentence_text = sentence_data['sentence']

                # Reject candidates too similar to already-selected ones.
                if self._is_too_similar(sentence_text, selected):
                    continue

                selected.append(sentence_data)
                used_sentences.add(sentence_text)

                if len(selected) >= max_sentences_per_word:
                    break

            self.selected_sentences[word] = selected
            print(f"      → 选择了 {len(selected)} 个例句")

        total_selected = sum(len(sentences) for sentences in self.selected_sentences.values())
        print(f"✓ 筛选完成：总共选择了 {total_selected} 个优质例句")

    def _is_too_similar(self, new_sentence: str, existing_sentences: List[Dict]) -> bool:
        """Return True if *new_sentence* is too similar to a selected one.

        Similarity is the Jaccard overlap of the two sentences' word sets;
        more than 70% overlap counts as too similar.
        """
        if not existing_sentences:
            return False

        new_words = set(word_tokenize(new_sentence.lower()))

        for existing in existing_sentences:
            existing_words = set(word_tokenize(existing['sentence'].lower()))

            union = new_words | existing_words
            if not union:
                # Both sentences tokenize to nothing: treat them as duplicates
                # instead of dividing by zero (bug fix).
                return True

            # Jaccard similarity of the two word sets.
            overlap = len(new_words & existing_words) / len(union)

            # Over 70% overlap is considered too similar.
            if overlap > 0.7:
                return True

        return False

    def save_results(self):
        """Write the selections to two CSVs: detailed (all rule scores) and simple."""
        print("💾 保存筛选结果...")

        # Detailed output includes every per-rule score.
        detailed_output = os.path.join(self.output_dir, 'selected_sentences_detailed.csv')
        with open(detailed_output, 'w', newline='', encoding='utf-8') as f:
            fieldnames = [
                'target_word', 'sentence', 'source_file', 'timestamp', 'frequency',
                'total_score', 'completeness_score', 'length_score', 'word_clarity_score',
                'difficulty_score', 'positivity_score', 'diversity_score',
                'dialogue_score', 'richness_score'
            ]
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()

            for word, sentences in self.selected_sentences.items():
                for sentence_data in sentences:
                    row = {
                        'target_word': word,
                        'sentence': sentence_data['sentence'],
                        'source_file': sentence_data['source_file'],
                        'timestamp': sentence_data['timestamp'],
                        # Rows never merged in dedup lack 'frequency': default 1.
                        'frequency': sentence_data.get('frequency', 1),
                        'total_score': round(sentence_data['total_score'], 2)
                    }

                    # Append each rule's individual score.
                    scores = sentence_data['scores']
                    for score_type in ['completeness', 'length', 'word_clarity', 'difficulty',
                                     'positivity', 'diversity', 'dialogue', 'richness']:
                        row[f'{score_type}_score'] = round(scores[score_type], 2)

                    writer.writerow(row)

        # Simple output keeps only the essentials.
        simple_output = os.path.join(self.output_dir, 'selected_sentences_simple.csv')
        with open(simple_output, 'w', newline='', encoding='utf-8') as f:
            fieldnames = ['target_word', 'sentence', 'source_file', 'total_score']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()

            for word, sentences in self.selected_sentences.items():
                for sentence_data in sentences:
                    writer.writerow({
                        'target_word': word,
                        'sentence': sentence_data['sentence'],
                        'source_file': sentence_data['source_file'],
                        'total_score': round(sentence_data['total_score'], 2)
                    })

        print(f"✓ 详细结果保存到: {detailed_output}")
        print(f"✓ 简化结果保存到: {simple_output}")

    def generate_report(self):
        """Write a human-readable text report summarizing the selection."""
        print("📊 生成筛选报告...")

        report_path = os.path.join(self.output_dir, 'selection_report.txt')

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("=" * 60 + "\n")
            f.write("智能例句筛选报告\n")
            f.write("=" * 60 + "\n\n")

            # Overall statistics.
            total_words = len(self.selected_sentences)
            total_sentences = sum(len(sentences) for sentences in self.selected_sentences.values())
            avg_sentences_per_word = total_sentences / total_words if total_words > 0 else 0

            f.write("📊 总体统计\n")
            f.write("-" * 30 + "\n")
            f.write(f"处理单词总数: {total_words}\n")
            f.write(f"筛选例句总数: {total_sentences}\n")
            f.write(f"平均每词例句数: {avg_sentences_per_word:.1f}\n\n")

            # Quality-score statistics across all selected sentences.
            all_scores = []
            for sentences in self.selected_sentences.values():
                for sentence_data in sentences:
                    all_scores.append(sentence_data['total_score'])

            if all_scores:
                f.write("🎯 质量分数统计\n")
                f.write("-" * 30 + "\n")
                f.write(f"平均质量分: {sum(all_scores) / len(all_scores):.2f}\n")
                f.write(f"最高质量分: {max(all_scores):.2f}\n")
                f.write(f"最低质量分: {min(all_scores):.2f}\n\n")

            # Per-word selection table.
            f.write("📝 按单词筛选结果\n")
            f.write("-" * 50 + "\n")
            f.write(f"{'单词':<15} {'例句数':<8} {'平均分':<8} {'最高分':<8}\n")
            f.write("-" * 50 + "\n")

            for word in sorted(self.selected_sentences.keys()):
                sentences = self.selected_sentences[word]
                scores = [s['total_score'] for s in sentences]
                avg_score = sum(scores) / len(scores) if scores else 0
                max_score = max(scores) if scores else 0

                f.write(f"{word:<15} {len(sentences):<8} {avg_score:<8.2f} {max_score:<8.2f}\n")

        print(f"✓ 筛选报告保存到: {report_path}")

    def run(self):
        """Execute the full pipeline: load, dedup, select, save, report."""
        print("🚀 启动智能例句筛选器")
        print("=" * 50)

        # Run the stages in order.
        self.load_data()
        self.deduplicate_sentences()
        self.select_best_sentences()
        self.save_results()
        self.generate_report()

        print("\n" + "=" * 50)
        print("✅ 智能筛选完成！")
        print(f"📁 结果文件保存在: {self.output_dir}")


def main():
    """Script entry point: validate the input file, then run the pipeline.

    Prints an error and returns early when the word_finder output CSV is
    missing; otherwise runs the full selection flow, reporting interruption
    or failure without raising.
    """
    input_csv = "data/match_results/output.csv"
    output_dir = "data/selected_sentences"

    # The selection pipeline needs the word_finder output to exist first.
    if not os.path.exists(input_csv):
        print(f"❌ 错误：找不到输入文件 {input_csv}")
        print("请先运行 word_finder 生成匹配数据")
        return

    try:
        SentenceSelector(input_csv, output_dir).run()
    except KeyboardInterrupt:
        print("\n\n⚠️  程序被用户中断")
    except Exception as e:
        # Top-level boundary: report the failure instead of crashing.
        print(f"\n❌ 程序运行出错: {e}")


if __name__ == "__main__":
    main()
